1 Objective

The objective of this notebook is to analyze the information published by the Argentinian government on the evolution of the current COVID-19 pandemic.

The dataset used can be found at http://datos.salud.gob.ar/dataset/covid-19-casos-registrados-en-la-republica-argentina/archivo/fd657d02-a33a-498b-a91b-2ef1a68b8d16

It is a CSV file containing one row for each person who was suspected of having COVID-19.

2 Loading the data

2.1 Loading the libraries we will use.

library(tidyr)
library(dplyr)
library(ggplot2)
library(magrittr)
library(leaflet)
library(rgdal)

2.2 Dataset loading

# Load the raw case listing (one row per suspected case).
# The file contains accented Spanish text (province names, funding source,
# ...). Without an explicit encoding those strings are displayed garbled on
# non-UTF-8 locales and may fail to compare equal to the UTF-8 province names
# of the shapefile they are later joined with.
# NOTE(review): assumes the CSV is UTF-8 encoded — confirm against the source.
cases <- read.csv("Covid19Casos.csv", encoding = "UTF-8")

3 Exploring the dataset

3.1 Taking a look at the formatting

# Print a compact, column-wise overview of the raw data (name, type and the
# first few values of every column).
cases %>% glimpse()
## Rows: 4,543,501
## Columns: 25
## $ id_evento_caso                   <int> 1000000, 1000002, 1000003, 1000005, …
## $ sexo                             <chr> "M", "M", "F", "F", "M", "M", "F", "…
## $ edad                             <int> 53, 21, 41, 58, 28, 26, 69, 73, 7, 4…
## $ edad_años_meses                  <chr> "Años", "Años", "Años", "Años", "Año…
## $ residencia_pais_nombre           <chr> "Argentina", "Argentina", "Argentina…
## $ residencia_provincia_nombre      <chr> "CABA", "Buenos Aires", "Córdoba", "…
## $ residencia_departamento_nombre   <chr> "SIN ESPECIFICAR", "La Matanza", "Ca…
## $ carga_provincia_nombre           <chr> "Buenos Aires", "Buenos Aires", "Cór…
## $ fecha_inicio_sintomas            <chr> "", "", "2020-05-24", "", "2020-05-3…
## $ fecha_apertura                   <chr> "2020-06-01", "2020-06-01", "2020-06…
## $ sepi_apertura                    <int> 23, 23, 23, 23, 23, 23, 23, 23, 23, …
## $ fecha_internacion                <chr> "", "", "", "", "", "", "", "", "", …
## $ cuidado_intensivo                <chr> "NO", "NO", "NO", "NO", "NO", "NO", …
## $ fecha_cui_intensivo              <chr> "", "", "", "", "", "", "", "", "", …
## $ fallecido                        <chr> "NO", "NO", "NO", "NO", "NO", "NO", …
## $ fecha_fallecimiento              <chr> "", "", "", "", "", "", "", "", "", …
## $ asistencia_respiratoria_mecanica <chr> "NO", "NO", "NO", "NO", "NO", "NO", …
## $ carga_provincia_id               <int> 6, 6, 14, 50, 6, 34, 2, 6, 2, 82, 2,…
## $ origen_financiamiento            <chr> "Privado", "Público", "Privado", "Pú…
## $ clasificacion                    <chr> "Caso Descartado", "Caso Descartado"…
## $ clasificacion_resumen            <chr> "Descartado", "Descartado", "Descart…
## $ residencia_provincia_id          <int> 2, 6, 14, 50, 6, 34, 2, 6, 2, 82, 2,…
## $ fecha_diagnostico                <chr> "2020-06-09", "2020-06-01", "2020-06…
## $ residencia_departamento_id       <int> 0, 427, 14, 49, 515, 35, 0, 260, 7, …
## $ ultima_actualizacion             <chr> "2020-12-31", "2020-12-31", "2020-12…

4 Data cleaning

Our first step is to clean up the Data.

Some people are recorded as being as old as 1000 years, which is clearly incorrect. We will only consider those under 100 years old. We will convert the date strings into a proper date type. We will not consider the cases that were discarded as not having COVID; however, we will keep the suspected cases and treat them as positives.

# Clean the raw listing in a single pipeline:
#   * drop cases classified as "Descartado" (discarded),
#   * keep only rows with a recorded age below 100,
#   * parse the opening and death dates from "YYYY-MM-DD" strings into Date
#     (empty strings become NA).
# NOTE(review): `edad_años_meses` is not consulted here, so ages recorded in
# months are compared as if they were years — confirm this is acceptable.
cases <- cases %>%
  filter(
    clasificacion_resumen != "Descartado",
    edad < 100
  ) %>%
  mutate(
    fecha_apertura = as.Date(fecha_apertura, format = "%Y-%m-%d"),
    fecha_fallecimiento = as.Date(fecha_fallecimiento, format = "%Y-%m-%d")
  )

5 Analysis

5.1 Number of cases.

# Total number of retained cases (one row per case).
nrow(cases)
## [1] 1976907

5.2 Cases by province.

# Number of cases per province of residence, largest first.
cases %>%
  group_by(residencia_provincia_nombre) %>%
  summarise(total = n()) %>%
  arrange(desc(total))
## # A tibble: 25 x 2
##    residencia_provincia_nombre  total
##    <chr>                        <int>
##  1 Buenos Aires                803344
##  2 Santa Fe                    199847
##  3 CABA                        189521
##  4 Córdoba                     167376
##  5 Tucumán                     117211
##  6 Mendoza                      73765
##  7 Neuquén                      49196
##  8 Río Negro                    43299
##  9 Chubut                       39422
## 10 Entre Ríos                   37620
## # … with 15 more rows
# Choropleth of the number of cases per province.

# Province polygons, read from the "ARG_adm1" layer of the "ARG_adm" folder.
argentina <- readOGR(
  dsn = "ARG_adm",
  layer = "ARG_adm1",
  use_iconv = TRUE,
  encoding = "UTF-8",
  stringsAsFactors = FALSE,
  verbose = FALSE
)

# Per-province totals, keyed by NAME_1 so they can be joined to the polygons.
# Rows without a specified province are dropped and "CABA" is renamed to the
# label used for the join.
cases_by_province <- cases %>%
  group_by(residencia_provincia_nombre) %>%
  summarise(total = n()) %>%
  filter(residencia_provincia_nombre != "SIN ESPECIFICAR") %>%
  mutate(
    NAME_1 = residencia_provincia_nombre,
    NAME_1 = replace(NAME_1, NAME_1 == "CABA", "Ciudad de Buenos Aires")
  )

# Attach the totals to the polygon attribute table.
argentina@data <- left_join(
  argentina@data,
  cases_by_province,
  by = c("NAME_1")
)

# HTML shown when a province is clicked.
state_popup <- paste0(
  "<strong>Estado: </strong>", argentina$NAME_1,
  "<br><strong>Casos: </strong>", argentina@data$total
)

# Five-class quantile colour scale.
pal <- colorQuantile("YlGn", NULL, n = 5)

leaflet(data = argentina) %>%
  addProviderTiles("CartoDB.Positron") %>%
  addPolygons(
    fillColor = ~pal(total),
    fillOpacity = 0.8,
    color = "#BDBDC3",
    weight = 1,
    popup = state_popup
  )

5.3 Deaths by province

# Number of deaths per province of residence, largest first.
cases %>%
  filter(fallecido == "SI") %>%
  group_by(residencia_provincia_nombre) %>%
  summarise(total = n()) %>%
  arrange(desc(total))
# Choropleth of the number of deaths per province.

# Re-read the province polygons so the attribute table starts without the
# `total` column added by any earlier join.
argentina <- readOGR(
  dsn = "ARG_adm",
  layer = "ARG_adm1",
  use_iconv = TRUE,
  encoding = "UTF-8",
  stringsAsFactors = FALSE,
  verbose = FALSE
)

# Per-province death totals, keyed by NAME_1 for the join below.
# NOTE: despite its name, `cases_by_province` holds deaths at this point.
cases_by_province <- cases %>%
  filter(fallecido == "SI") %>%
  group_by(residencia_provincia_nombre) %>%
  summarise(total = n()) %>%
  filter(residencia_provincia_nombre != "SIN ESPECIFICAR") %>%
  mutate(
    NAME_1 = residencia_provincia_nombre,
    NAME_1 = replace(NAME_1, NAME_1 == "CABA", "Ciudad de Buenos Aires")
  )

# Attach the totals to the polygon attribute table.
argentina@data <- left_join(
  argentina@data,
  cases_by_province,
  by = c("NAME_1")
)

# HTML shown when a province is clicked.
state_popup <- paste0(
  "<strong>Estado: </strong>", argentina$NAME_1,
  "<br><strong> Muertes: </strong>", argentina@data$total
)

# Five-class quantile colour scale.
pal <- colorQuantile("YlGn", NULL, n = 5)

leaflet(data = argentina) %>%
  addProviderTiles("CartoDB.Positron") %>%
  addPolygons(
    fillColor = ~pal(total),
    fillOpacity = 0.8,
    color = "#BDBDC3",
    weight = 1,
    popup = state_popup
  )

5.5 Mortality rate

The general mortality rate is the number of people who died of COVID divided by the number of people who got it. Expressed as a percentage, it is:

# Overall mortality: deaths as a percentage of all retained cases.
100 * sum(cases$fallecido == "SI") / nrow(cases)
## [1] 2.253267

Grouping by gender, we get.

# Mortality percentage for each recorded sex, printed as a plain data frame.
cases %>%
  group_by(sexo) %>%
  summarise(mortality = sum(fallecido == "SI") * 100 / n()) %>%
  print.data.frame()
##   sexo mortality
## 1    F  1.915604
## 2    M  2.579833
## 3   NR  5.048424

It seems that COVID kills men at a noticeably higher rate than it does women. To check that this difference is not an artifact of one sex being under-represented among the cases, we will count how many males and females there are in this dataset.

# Number of cases for each recorded sex, printed as a plain data frame.
cases %>%
  group_by(sexo) %>%
  summarise(total = n()) %>%
  print.data.frame()
##   sexo  total
## 1    F 989975
## 2    M 982079
## 3   NR   4853

Males and females are represented almost equally in the dataset, which means that men who get COVID are more likely to die than women. There could be many reasons for this; after some research, it may simply be that women have stronger immune systems and that men are more likely to have comorbidities that make COVID more deadly.

5.6 The mortality rate by age

# Mortality percentage for every age (in years, 0-99).
age_mortality <- cases %>%
  group_by(edad) %>%
  summarise(mortality = sum(fallecido == "SI") * 100 / length(fallecido))

# Adjusted R^2 of a straight-line fit of mortality on age; shown on the plot.
linear_fit <- lm(mortality ~ edad, data = age_mortality)
r_value <- summary(linear_fit)$adj.r.squared

# Scatter plot with the default smoother and the R^2 annotation.
ggplot(data = age_mortality, mapping = aes(x = edad, y = mortality)) +
  geom_point() +
  geom_smooth() +
  labs(
    title = "Mortality by age",
    x = "Age",
    y = "Mortality(Percentage)"
  ) +
  annotate("text", x = 20, y = 50, label = paste("R^2: ", r_value))

As most people already know, the older you are, the more likely you are to die of COVID. Up until age 50, it seems very unlikely but then the mortality jumps almost linearly until reaching almost 40-50% for those 90 and older.

We can divide it into two graphs to see it more clearly.

# Mortality percentage per age, restricted to cases younger than 50.
age_mortality <- cases %>%
  filter(edad < 50) %>%
  group_by(edad) %>%
  summarise(mortality = sum(fallecido == "SI") * 100 / length(fallecido))

# Adjusted R^2 of a straight-line fit of mortality on age; shown on the plot.
linear_fit <- lm(mortality ~ edad, data = age_mortality)
r_value <- summary(linear_fit)$adj.r.squared

# Scatter plot with the default smoother and the R^2 annotation.
ggplot(data = age_mortality, mapping = aes(x = edad, y = mortality)) +
  geom_point() +
  geom_smooth() +
  labs(
    title = "Mortality by age",
    x = "Age",
    y = "Mortality(Percentage)"
  ) +
  annotate("text", x = 20, y = 1, label = paste("R^2: ", r_value))

# Mortality percentage per age for the older half of the split.
# The companion plot covers `edad < 50`, so this one must start at 50
# (`>=`, not `>`); otherwise cases aged exactly 50 appear in neither graph.
age_mortality <- cases %>%
  filter(edad >= 50) %>%
  group_by(edad) %>%
  summarise(mortality = sum(fallecido == "SI") * 100 / length(fallecido))

# Adjusted R^2 of a straight-line fit of mortality on age; shown on the plot.
r_value <- summary(lm(mortality ~ edad, data = age_mortality))$adj.r.squared

# Scatter plot with the default smoother and the R^2 annotation.
ggplot(data = age_mortality) +
  geom_point(mapping = aes(x = edad, y = mortality)) +
  geom_smooth(mapping = aes(x = edad, y = mortality)) +
  labs(y = "Mortality(Percentage)", x = "Age", title = "Mortality by age") +
  annotate("text", x = 60, y = 50, label = paste("R^2: ", r_value))

For the first graph the R^2 is very small, meaning that if you’re under 50, your age isn’t really that important, since overall you’re not at risk; the outcome probably depends more on whether you have a serious condition that makes you vulnerable to the virus.

However, if you’re over 50, age becomes a main factor in deciding whether you’ll survive it or not.

5.7 Number of cases per day.

# Number of cases opened on each day.
cases_by_day <- cases %>%
  group_by(fecha_apertura) %>%
  summarise(total = n())

# Daily counts with a loess trend line. Rows with a missing date are dropped
# silently (na.rm = TRUE).
# The axis labels now match the mapped variables: the date is on x and the
# count on y (they were previously swapped).
ggplot(data = cases_by_day) +
  geom_point(mapping = aes(x = fecha_apertura, y = total), na.rm = TRUE) +
  geom_smooth(
    method = "loess",
    mapping = aes(x = fecha_apertura, y = total),
    na.rm = TRUE,
    formula = y ~ x
  ) +
  labs(x = "Date", y = "Total", title = "Cases per day")

After October, cases seem to have diminished, even though restrictions had been relaxed in October.

5.8 Numbers of deaths per day

# Number of deaths on each day.
# Deaths are counted by their date of death (`fecha_fallecimiento`), not by
# the date the case was opened; grouping by `fecha_apertura` would instead
# show, per opening day, how many of those cases eventually died.
# The count column keeps its original name, `mortality`.
daily_deaths <- cases %>%
  filter(fallecido == "SI") %>%
  group_by(fecha_fallecimiento) %>%
  summarise(mortality = n())

# Daily counts with a loess trend line. Deaths without a recorded date are
# dropped silently (na.rm = TRUE).
ggplot(data = daily_deaths) +
  geom_point(
    mapping = aes(x = fecha_fallecimiento, y = mortality),
    na.rm = TRUE
  ) +
  geom_smooth(
    method = "loess",
    mapping = aes(x = fecha_fallecimiento, y = mortality),
    na.rm = TRUE,
    formula = y ~ x
  ) +
  labs(y = "Deaths", x = "Date", title = "Deaths per day")

As with cases, the number of deaths peaked in October and diminished after it.

5.9 How many days after being diagnosed did the people who died die?

# For the cases that died: days elapsed between the case opening date and the
# date of death, kept only when strictly between 0 and 40 days, then the
# share (in percent) of deaths falling on each day count.
# NOTE(review): the elapsed time is measured from `fecha_apertura`, not from
# `fecha_diagnostico` — confirm this is the intended "diagnosis" date.
mortality_days <- cases %>%
  filter(fallecido == "SI") %>%
  mutate(difference = fecha_fallecimiento - fecha_apertura) %>%
  filter(difference > 0, difference < 40) %>%
  group_by(difference) %>%
  summarise(total = n()) %>%
  mutate(ptg = prop.table(total) * 100)

# Cumulative percentage of deaths by number of days elapsed. The rows come out
# of the grouped summary ordered by `difference`, which cumsum() relies on.
ggplot(data = mortality_days, mapping = aes(x = difference, y = cumsum(ptg))) +
  geom_point() +
  labs(
    title = "How many days after diagnosis do people who died die?(Cumulative)",
    x = "Days after diagnosis",
    y = "Cumulative Percentage"
  )

50% died within 10 days of their diagnosis, so it’s safe to say most of these people were diagnosed too late. They probably only got to the hospital once they needed hospitalization, meaning they could have been infected for over 10 days.